# Notebook code prepared by Jacob Tlalka, with some later adjustments by L. Tlalka (April 2020).
import math
import datetime
import requests
import wget
import os
import pandas as pd
from bs4 import BeautifulSoup
import plotly.offline as py
# Default selection of countries drawn by show_countries() when the caller
# does not pass its own list.
COUNTRIES_OF_INTEREST = [
    'Poland',
    'Czech Republic',
    'United Kingdom',
    'Switzerland',
    'Japan',
    'United States',
    'France',
    'Italy',
    'Spain',
    'Germany',
    'Netherlands',
    'Iran',
    'India',
    'Norway',
    'South Korea',
    'Sweden',
    'Belgium',
    'Austria',
    'Denmark',
    'China',
]
# Download the total case counts (one column per country, plus the 'date'
# column) and load them into a dataframe.
DATA_URL = 'https://covid.ourworldindata.org/data/ecdc/total_cases.csv'

# Delete a previous download first; presumably this keeps wget from saving
# the new file under a different name -- TODO confirm against wget's behaviour.
if os.path.exists('total_cases.csv'):
    os.remove('total_cases.csv')
wget.download(DATA_URL)

df = pd.read_csv('total_cases.csv')
DAYS = len(df)               # number of rows in the downloaded file
DATE = df['date'][DAYS - 1]  # 'date' value of the last row
print(DAYS, DATE)
df.tail(3)
# Every column of the dataframe except 'date' and 'World' is treated as a
# country name. The result comes from a set, so its order is arbitrary.
countries = list(set(df.columns) - {'date', 'World'})
len(countries)
# Scrape the current figures from the table on the worldometers coronavirus page.
WORLDOMETERS_URL = 'https://www.worldometers.info/coronavirus/'

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the data page.
response = requests.get(WORLDOMETERS_URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

countries_today = soup.find(id='main_table_countries_today')
if countries_today is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(
        "Table 'main_table_countries_today' not found on " + WORLDOMETERS_URL)
# All rows (<tr>) of the table, header row first.
main_table = countries_today.find_all('tr')

# Maps country names as written in the scraped table to the names used for
# the same countries in the rest of the script.
COUNTRY_MAP = {
    'S. Korea': 'South Korea',
    'UK': 'United Kingdom',
    'USA': 'United States',
    'UAE': 'United Arab Emirates',
    'Czechia': 'Czech Republic',
}
# Column headers of the scraped table, cleaned up for later use.
# Only every second child of the header row is kept (the [1::2] slice).
t_col_tags = list(main_table[0].children)[1::2]

table_col_names = []
for header_tag in t_col_tags:
    # Non-breaking spaces are replaced by the literal text 'al' and newlines
    # are removed (presumably turning an abbreviated "Tot..." header into
    # "Total..." -- TODO confirm against the live page).
    header_text = header_tag.text.replace('\xa0', 'al').replace('\n', '')
    table_col_names.append(header_text.strip())

# (position, name) pairs, printed as a numbered list of the columns.
col_names = list(enumerate(table_col_names))
for i, col in col_names:
    print(f'{i}. {col}')
# Split the scraped rows into a continent part and a country part.
# Row 7 is skipped on purpose: it holds unknown, not meaningful data.
continents_table = [*main_table[1:7], main_table[8]]
countries_table = main_table[9:]

# Cell texts of row 8, printed next to the matching column names.
world_cells = list(main_table[8].children)[1::2]
world_values = [cell.text for cell in world_cells]
for name, value in zip(table_col_names, world_values):
    print(f'{name}: {value}')
# Collect the name, case total and death total of each row in continents_table.
cont_names = []
cont_scores = {}
cont_deaths = {}
for table_row in continents_table:
    cells = [cell.text.strip() for cell in list(table_row.children)[1::2]]
    continent = cells[12]
    cont_names.append(continent)
    # Numbers are scraped as text with thousands separators, e.g. "1,234".
    cont_scores[continent] = int(cells[1].replace(',', ''))
    deaths_text = cells[3]
    if deaths_text:
        # Rows with an empty deaths cell get no entry in cont_deaths.
        cont_deaths[continent] = int(deaths_text.replace(',', ''))
print(cont_names)
# Collect cases, deaths and an estimated population per country, and sum the
# estimated populations per continent.
country_scores, deaths, population = {}, {}, {}
# Total estimated population per continent name collected earlier.
cont_pop = {continent: 0 for continent in cont_names}
for table_row in countries_table:
    cells = [cell.text.strip() for cell in list(table_row.children)[1::2]]
    country, death_cnt, continent = cells[0], cells[3], cells[12]
    # Use the same country spelling as the rest of the script.
    country = COUNTRY_MAP.get(country, country)
    cases = int(cells[1].replace(',', ''))
    country_scores[country] = cases
    if death_cnt:
        deaths[country] = int(death_cnt.replace(',', ''))

    # Column 8 is used as a "cases per one million inhabitants" rate
    # (presumably the 'Tot Cases/1M pop' column -- TODO confirm), so the
    # population can be recovered as cases * 10**6 / rate.
    rate_text = cells[8].replace(',', '')
    cases_per_million = float(rate_text) if rate_text else 0.0
    # Test the numeric value rather than the text: the string "0" is truthy
    # and would otherwise lead to a division by zero.
    if cases_per_million > 0:
        population[country] = round(cases * 10**6 / cases_per_million, 0)
        # Rows whose continent label is not one of cont_names (e.g. an empty
        # label) are left out of the continent totals instead of raising
        # KeyError.
        if continent in cont_pop:
            cont_pop[continent] += population[country]
# Spot checks of the scraped figures for a few countries.
for country in ('Poland', 'Czech Republic', 'Slovakia', 'Germany', 'Ukraine', 'Lithuania', 'Sweden'):
    print(f'{country}: {country_scores[country]} cases')
print(f"Number of countries: {len(country_scores)}\n")

# All three collected figures for one example country.
c_name = 'Spain'
print(
    f"Country name: {c_name}\t All cases: {country_scores[c_name]}"
    f"\tDeaths: {deaths[c_name]}\tPopulation: {population[c_name]}\n"
)

print('Calculated population on continents:')
for k, v in cont_pop.items():
    print(f'{k}: {v:,}')
# Continent figures aggregated into one pandas dataframe.
# Rows are built by looking every value up by continent name. Zipping the
# dictionaries' values positionally would shift or truncate the rows as soon
# as one continent has no death figure (cont_deaths only holds non-empty ones).
cont_df = pd.DataFrame(
    [(name, cont_pop[name], cont_scores[name], cont_deaths.get(name, math.nan))
     for name in cont_scores],
    columns=['Continent', 'Population', 'Total cases', 'Death cases'])
# Figures per one million inhabitants. A population of 0 yields inf/NaN here.
cont_df['Total Cases/1M pop'] = (
    cont_df['Total cases'] / cont_df['Population'] * 10**6).round(0)
cont_df['Death Cases/1M pop'] = (
    cont_df['Death cases'] / cont_df['Population'] * 10**6).round(0)
cont_df
# Two side-by-side bar charts of the per-1M continent figures
# (plotly.graph_objects).
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=['Total cases', 'Death cases'])
# One bar trace per subplot column: total cases on the left, deaths on the right.
for subplot_col, value_column in enumerate(['Total Cases/1M pop', 'Death Cases/1M pop'], start=1):
    bars = go.Bar(x=cont_df['Continent'], y=cont_df[value_column], showlegend=False)
    fig.add_trace(bars, row=1, col=subplot_col)
fig.update_layout(
    title={'text': 'Pandemia numbers per 1M of population for each continent', 'x': 0.5})
fig.show()
# Countries that appear in 'total_cases.csv' but have no scraped score are
# added to country_scores with the value None; their names are listed.
no_score = [name for name in countries if name not in country_scores]
country_scores.update(dict.fromkeys(no_score))

print("Number of countries in country_score dictionary:", len(country_scores), end='\n\n')
print(
    f"{len(no_score)} countries from 'total_cases.csv' list has been added to "
    "'country_scores' dictionary with None values:",
    end='\n\n',
)
print(no_score)
# Drop from country_scores every entry whose name is not a country column of
# dataframe 'df', remembering what was removed.
countries_with_scores = list(country_scores)
csv_countries = set(countries)
countries_removed = []
# Iterate over the snapshot of keys because the dictionary shrinks in the loop.
for name in countries_with_scores:
    if name in csv_countries:
        continue
    countries_removed.append((name, country_scores.pop(name)))

# Add values for the two non-country columns of 'df': 'World' is left empty
# for now and 'date' is the day after the last date in the CSV.
country_scores['World'] = None
last_csv_day = datetime.datetime.strptime(DATE, "%Y-%m-%d")
following_day = last_csv_day + datetime.timedelta(days=1)
country_scores['date'] = str(following_day.date())

print(f"Current number of countries in 'country_scores' dictionary: {len(country_scores)}")
print()
print(
    f"{len(countries_removed)} countries removed from the 'country_scores' dictionary as not listed in 'total_cases.csv':\n",
    countries_removed,
)
# One-row dataframe holding the scraped values from country_scores. Its index
# label is the next free row number of 'df' and it has the same columns.
next_row_label = len(df)
country_scores_df = pd.DataFrame(country_scores, index=[next_row_label], columns=df.columns)
# The world total comes from the second cell of the scraped world row.
world_total = int(world_values[1].replace(',', ''))
country_scores_df['World'] = world_total
country_scores_df

# Stack the CSV history and the scraped row into one dataframe.
latest_df = pd.concat([df, country_scores_df])
print(latest_df.shape)
latest_df.tail(3)
# Rolling operations need numeric input. latest_df still carries the string
# 'date' column, and columns whose newest value is None have object dtype.
# Older pandas silently dropped such columns; current pandas raises instead.
# Converting up front makes the result independent of the pandas version and
# keeps every country column (None becomes NaN).
numeric_df = latest_df.drop(columns=['date']).astype(float)

# Smoothed daily increase: difference to the previous day, then the mean of
# the last 6 such differences.
diff_df = numeric_df.rolling(2).apply(
    lambda x: x[1] - x[0], raw=True).rolling(6).mean().round(0)
diff_df.tail(3)

# Same idea, but with the daily change expressed in percent of the previous day.
diff_prc_df = numeric_df.rolling(2).apply(
    lambda x: (x[1] / x[0] - 1.0) * 100.0, raw=True).rolling(6).mean().round(0)
list(diff_prc_df['Italy'][-15:])
# Number of interpolation steps per day used by the hourly simulation.
H = 24

# Rolling windows need numeric input: drop the string 'date' column and turn
# object columns (those containing None) into floats with NaN. Older pandas
# dropped such columns silently; current pandas raises on them.
numeric_latest = latest_df.drop(columns=['date']).astype(float)

# Weighted 3-day window (oldest 0.1, middle 0.5, newest 0.4), shifted one row
# up. Each row therefore combines the day before, the day itself and the day
# after.
smoothed_df = numeric_latest.rolling(3).apply(
    lambda x: round(x[0] * 0.1 + x[1] * 0.5 + x[2] * 0.4, 0), raw=True).shift(-1)
smoothed_df.tail()

# The shift leaves the last row empty: fill it with the latest unsmoothed
# values, then put the 'date' column back.
smoothed_df.iloc[-1] = numeric_latest.iloc[-1]
smoothed_df['date'] = latest_df['date']
smoothed_df.tail()
# Simulated hourly values: H linearly interpolated steps between each pair of
# consecutive days. These are computed values, not real measurements.
sdf = smoothed_df.drop(columns=['date'])
sers = []
for day in range(len(sdf) - 1):
    day_start, day_end = sdf.iloc[day], sdf.iloc[day + 1]
    # Step h is weighted (H - h)/H towards this day and h/H towards the next.
    sers.extend(
        round((day_start * (H - h) + day_end * h) / H, 1) for h in range(H))
# Each interpolated series becomes one row of the hourly dataframe.
hour_df = pd.concat(sers, axis=1).T
hour_df.tail()
def select_countries(range_min=None, range_max=None, top_num=None):
    """Return country names from ``country_scores``, highest case count first.

    Entries whose score is not an integer (e.g. ``None``) are ignored.

    Args:
        range_min: keep only countries with at least this many cases;
            ``None`` means no lower bound.
        range_max: keep only countries with at most this many cases;
            ``None`` means no upper bound.
        top_num: keep only the first ``top_num`` countries of the ranking;
            ``None`` means no limit.

    Returns:
        List of country names sorted by case count in descending order.
    """
    ranked = [
        (name, cases) for name, cases in country_scores.items()
        if isinstance(cases, int)
    ]
    # Compare against None explicitly so that a bound or limit of 0 is
    # honoured instead of being treated as "not given".
    if range_min is not None:
        ranked = [pair for pair in ranked if pair[1] >= range_min]
    if range_max is not None:
        ranked = [pair for pair in ranked if pair[1] <= range_max]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    if top_num is not None:
        ranked = ranked[:top_num]
    return [name for name, _ in ranked]
def death_ratio(country):
    """Return the share of deaths in the total number of cases of a country.

    The ratio is rounded to 3 decimal places. ``0.0`` is returned when the
    country has no entry in ``deaths``, or when its case count in
    ``country_scores`` is missing, ``None`` or zero.
    """
    cases = country_scores.get(country)
    # A None or zero case count would raise TypeError / ZeroDivisionError.
    if country not in deaths or not cases:
        return 0.0
    return round(deaths[country] / cases, 3)
# Death ratios of the 60 countries with the most cases, highest ratio first.
country_drs = [(country, death_ratio(country)) for country in select_countries(top_num=60)]
country_drs.sort(key=lambda pair: pair[1], reverse=True)

# One row per country. The ratio is converted to percent on the column:
# multiplying the list itself (country_drs * 100) would repeat every row
# 100 times and leave the values as ratios.
df_drs = pd.DataFrame(country_drs, columns=['Country', 'Death_procent'])
df_drs['Death_procent'] = (df_drs['Death_procent'] * 100).round(1)

# Ranking as (position, (country, ratio)) pairs, positions starting at 1.
c_d_r = list(enumerate(country_drs, start=1))
# Optional plain-text listing of the same ranking:
# print("\t\t\tThe present death ratio from CoronaVirus Pandemia in 60 top countries:")
# for i, (k, v) in c_d_r:
#     print(f'{i}. {k}: {v*100:.1f}%')
# Bar chart of the death percentages, one bar per country.
import plotly.express as px

# px.bar documents `title` as a plain string, so the text is passed directly
# and the horizontal centring (x=0.5) is applied through the layout afterwards.
fig = px.bar(df_drs, x='Country', y='Death_procent', color='Death_procent',
             title='Death percentage of pandemia top 60 countries')
fig.update_layout(title_x=0.5)
fig.show()
def show_countries(with_log=True, threshold=100, selected_countries=COUNTRIES_OF_INTEREST, data=latest_df):
    """Plot one line per selected country, aligned on reaching `threshold`.

    For every selected country the missing values are dropped, the values
    below `threshold` are removed (when `threshold` is truthy) and the
    remaining values are renumbered from 0, so that all curves start at the
    same x position.

    Args:
        with_log: draw the y axis on a logarithmic scale.
        threshold: minimum value a data point needs in order to be plotted;
            0 or None disables the filter.
        selected_countries: names of the `data` columns to plot.
        data: dataframe with one column per country. `latest_df` and
            `hour_df` are labelled as totals, `diff_prc_df` as daily
            percentage changes and any other frame as daily changes.

    Raises:
        KeyError: if a selected country is not a column of `data`.
    """
    traces = []
    # Only the selected columns are processed; the other countries are never drawn.
    for country in selected_countries:
        values = data[country].dropna()
        if threshold:
            # Filter on the already cleaned series so that mask and data
            # share the same index.
            values = values[values >= threshold]
        # Renumber from 0, then pad back to the full index with NaN.
        aligned = values.reset_index(drop=True).reindex(data.index)
        traces.append({'x': data.index, 'y': aligned, 'name': country})

    # Titles and axis labels depend on which dataframe is being shown.
    scale = 'logarithmic' if with_log else 'linear'
    shows_totals = data is latest_df or data is hour_df
    if shows_totals:
        title = 'Coronavirus Pandemia in Chosen Countries - in Total'
        y_label = f'Number of cases in {scale} scale'
    else:
        title = 'Coronavirus Pandemia in Chosen Countries - Daily Trend'
        if data is diff_prc_df:
            y_label = 'Daily percentage changes in number of cases'
        else:
            # Also the fallback for frames other than diff_df, so the label
            # is always defined.
            y_label = 'Daily changes in number of cases'
    unit = 'Hours' if data is hour_df else 'Days'

    y_axis = {'title': y_label}
    if with_log:
        # Switch the scale only; the label keeps describing the plotted data.
        y_axis['type'] = 'log'
    layout = {'title': {'text': title, 'x': 0.5, 'y': 0.9},
              'xaxis': {'title': f'{unit} which have gone from crossing a number of {threshold} confirmed cases'},
              'yaxis': y_axis}
    py.iplot({'data': traces, 'layout': layout})
#py.iplot(data_1, filename='./cufflinks/simple-line.html', image_width=800, image_height=600)
# Default plot: countries of interest, logarithmic scale, threshold of 100.
show_countries()

# Total numbers for the first 30 countries, highest count first. Countries
# without a score count as 0; ties are ordered by name in reverse.
ranked_totals = sorted(
    ((country_scores[country] or 0, country) for country in countries),
    reverse=True)
for i, (cases, country) in enumerate(ranked_totals[:30], 1):
    print(f'{i}. {country} {cases}')
# Series of comparison plots. The same ten countries are used in several of
# them, so the list is written down once.
_comparison_countries = ['Poland', 'Czech Republic', 'Italy', 'Germany', 'France',
                         'United Kingdom', 'United States', 'Spain', 'Switzerland', 'Japan']

# Simulated hourly totals: every country with at least 10000 cases, then the ten above.
show_countries(with_log=False, threshold=100, data=hour_df,
               selected_countries=select_countries(range_min=10000))
show_countries(with_log=False, threshold=1000, data=hour_df,
               selected_countries=list(_comparison_countries))

# Smoothed daily changes.
show_countries(with_log=False, threshold=1, data=diff_df,
               selected_countries=['Italy', 'Spain', 'United States', 'Poland'])
show_countries(with_log=False, threshold=1, data=diff_df,
               selected_countries=['Poland', 'Czech Republic'])
show_countries(with_log=False, threshold=100, data=diff_df,
               selected_countries=select_countries(top_num=10))

# Simulated hourly totals for the countries with the most cases.
show_countries(threshold=1000, data=hour_df,
               selected_countries=select_countries(top_num=30))
show_countries(with_log=False, threshold=300, data=hour_df,
               selected_countries=select_countries(top_num=32))
show_countries(with_log=False, threshold=300, data=hour_df,
               selected_countries=list(_comparison_countries))

# Smoothed daily percentage changes (threshold 0 switches the filter off).
show_countries(with_log=False, threshold=0, data=diff_prc_df,
               selected_countries=_comparison_countries + ['China', 'India'])
show_countries(with_log=False, threshold=0, data=diff_prc_df,
               selected_countries=['Poland', 'Czech Republic', 'France', 'Italy', 'Spain'])